library(dplyr)
library(ggplot2)
library(tidytext)
library(rtweet)
load("data/transcripts.RData")
load("data/ratings.RData")
Inicialmente, minha ideia era utilizar dados do twitter (texto e hora do tweet) de alguma partida de futebol e comparar a frequência de determinadas palavras com os eventos do jogo.
No entanto, um problema que surgiu foi que a api gratuita do tweet aparentemente não retorna tweets de maneira constante ao decorrer da partida. Segue um exemplo em que temos tweets apenas para alguns intervalos de tempo do jogo.
# Parse the tweets streamed during the CEA x FLU match and plot how many
# arrived per minute (ts_plot() is an rtweet helper built on ggplot2).
tweets <- parse_stream("CEAxFLU.json")
ts_plot(tweets, by = "mins") +
  theme_bw()
Este problema me fez desistir da proposta inicial e buscar outras opções para o trabalho final.
Alguns meses atrás, navegando no Reddit, encontrei uma visualização que me chamou bastante atenção:
na época, eu estava estudando mineração de textos e tive a ideia de utilizar métodos de text mining para buscar palavras que identifiquem cada personagem e avaliar a relação entre a quantidade de falas de um personagem em um episódio com a nota do episódio no IMDB. Como eu nunca assisti The Office, a série desta visualização, decidi utilizar os dados da série Avatar: The Last Airbender (ATLA).
Os datasets que serão utilizados são:
O script dos 61 episódios do ATLA disponível em: http://avatar.wikia.com/wiki/Avatar_Wiki:Transcripts;
As notas do IMDB desta série, que podem ser encontradas em: https://www.imdb.com/title/tt0417299/.
Ambos os conjuntos de dados foram coletados via web scraping. O código do scrape está no arquivo scrape.R.
Colunas do data frame dos roteiros
glimpse(transcripts)
## Observations: 9,992
## Variables: 4
## $ speaker <chr> "Katara", "Sokka", "Katara", "Sokka", "Katara", "Kata...
## $ text <chr> "Water. Earth. Fire. Air. My grandmother used to tell...
## $ epi_num <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ epi_name <fct> The Boy in the Iceberg, The Boy in the Iceberg, The B...
Notas dos episódios
ratings
## [1] 8.1 8.3 8.5 8.2 8.2 8.0 8.2 8.9 8.0 7.9 7.1 9.0 9.1 8.0 7.9 8.5 8.1
## [18] 8.6 9.4 9.7 8.7 8.2 8.5 7.8 7.6 9.1 9.4 8.9 8.5 8.8 8.7 8.4 9.0 8.7
## [35] 9.2 8.8 9.0 8.6 9.0 9.6 8.7 8.4 8.0 8.8 8.5 9.4 8.2 9.1 7.8 9.1 9.4
## [52] 8.9 9.1 8.7 9.2 8.9 8.6 9.1 9.5 9.7 9.8
Inicialmente, realizarei um pre-processamento nos textos, removerei as introduções e tudo que estiver entre colchetes.
Exemplo de texto antes da limpeza
transcripts$text[2]
## [1] "It's not getting away from me this time. [Close-up of the boy as he grins confidently over his shoulder in the direction of the girl.] Watch and learn, Katara. This is how you catch a fish.\n"
# The very first row is the series' opening narration, which repeats in
# every episode; drop all of its occurrences, then strip bracketed stage
# directions and digits from the remaining dialogue.
intro <- transcripts$text[1]
transcripts <- transcripts %>%
  filter(text != intro) %>%
  mutate(
    text = stringr::str_replace_all(text, '\\[(.*?)\\]', ''),
    text = qdapRegex::rm_number(text)
  )
Exemplo de texto depois da limpeza
transcripts$text[1] # Ãndice 1 pois removi a introdução
## [1] "It's not getting away from me this time. Watch and learn, Katara. This is how you catch a fish."
Agora, colocarei o texto no formato de um data frame de uma palavra por linha, com todas as palavras em caixa baixa e removerei as seguintes stop words
sort(unique(stop_words$word))
## [1] "a" "a's" "able" "about"
## [5] "above" "according" "accordingly" "across"
## [9] "actually" "after" "afterwards" "again"
## [13] "against" "ain't" "all" "allow"
## [17] "allows" "almost" "alone" "along"
## [21] "already" "also" "although" "always"
## [25] "am" "among" "amongst" "an"
## [29] "and" "another" "any" "anybody"
## [33] "anyhow" "anyone" "anything" "anyway"
## [37] "anyways" "anywhere" "apart" "appear"
## [41] "appreciate" "appropriate" "are" "area"
## [45] "areas" "aren't" "around" "as"
## [49] "aside" "ask" "asked" "asking"
## [53] "asks" "associated" "at" "available"
## [57] "away" "awfully" "b" "back"
## [61] "backed" "backing" "backs" "be"
## [65] "became" "because" "become" "becomes"
## [69] "becoming" "been" "before" "beforehand"
## [73] "began" "behind" "being" "beings"
## [77] "believe" "below" "beside" "besides"
## [81] "best" "better" "between" "beyond"
## [85] "big" "both" "brief" "but"
## [89] "by" "c" "c'mon" "c's"
## [93] "came" "can" "can't" "cannot"
## [97] "cant" "case" "cases" "cause"
## [101] "causes" "certain" "certainly" "changes"
## [105] "clear" "clearly" "co" "com"
## [109] "come" "comes" "concerning" "consequently"
## [113] "consider" "considering" "contain" "containing"
## [117] "contains" "corresponding" "could" "couldn't"
## [121] "course" "currently" "d" "definitely"
## [125] "described" "despite" "did" "didn't"
## [129] "differ" "different" "differently" "do"
## [133] "does" "doesn't" "doing" "don't"
## [137] "done" "down" "downed" "downing"
## [141] "downs" "downwards" "during" "e"
## [145] "each" "early" "edu" "eg"
## [149] "eight" "either" "else" "elsewhere"
## [153] "end" "ended" "ending" "ends"
## [157] "enough" "entirely" "especially" "et"
## [161] "etc" "even" "evenly" "ever"
## [165] "every" "everybody" "everyone" "everything"
## [169] "everywhere" "ex" "exactly" "example"
## [173] "except" "f" "face" "faces"
## [177] "fact" "facts" "far" "felt"
## [181] "few" "fifth" "find" "finds"
## [185] "first" "five" "followed" "following"
## [189] "follows" "for" "former" "formerly"
## [193] "forth" "four" "from" "full"
## [197] "fully" "further" "furthered" "furthering"
## [201] "furthermore" "furthers" "g" "gave"
## [205] "general" "generally" "get" "gets"
## [209] "getting" "give" "given" "gives"
## [213] "go" "goes" "going" "gone"
## [217] "good" "goods" "got" "gotten"
## [221] "great" "greater" "greatest" "greetings"
## [225] "group" "grouped" "grouping" "groups"
## [229] "h" "had" "hadn't" "happens"
## [233] "hardly" "has" "hasn't" "have"
## [237] "haven't" "having" "he" "he'd"
## [241] "he'll" "he's" "hello" "help"
## [245] "hence" "her" "here" "here's"
## [249] "hereafter" "hereby" "herein" "hereupon"
## [253] "hers" "herself" "hi" "high"
## [257] "higher" "highest" "him" "himself"
## [261] "his" "hither" "hopefully" "how"
## [265] "how's" "howbeit" "however" "i"
## [269] "i'd" "i'll" "i'm" "i've"
## [273] "ie" "if" "ignored" "immediate"
## [277] "important" "in" "inasmuch" "inc"
## [281] "indeed" "indicate" "indicated" "indicates"
## [285] "inner" "insofar" "instead" "interest"
## [289] "interested" "interesting" "interests" "into"
## [293] "inward" "is" "isn't" "it"
## [297] "it'd" "it'll" "it's" "its"
## [301] "itself" "j" "just" "k"
## [305] "keep" "keeps" "kept" "kind"
## [309] "knew" "know" "known" "knows"
## [313] "l" "large" "largely" "last"
## [317] "lately" "later" "latest" "latter"
## [321] "latterly" "least" "less" "lest"
## [325] "let" "let's" "lets" "like"
## [329] "liked" "likely" "little" "long"
## [333] "longer" "longest" "look" "looking"
## [337] "looks" "ltd" "m" "made"
## [341] "mainly" "make" "making" "man"
## [345] "many" "may" "maybe" "me"
## [349] "mean" "meanwhile" "member" "members"
## [353] "men" "merely" "might" "more"
## [357] "moreover" "most" "mostly" "mr"
## [361] "mrs" "much" "must" "mustn't"
## [365] "my" "myself" "n" "name"
## [369] "namely" "nd" "near" "nearly"
## [373] "necessary" "need" "needed" "needing"
## [377] "needs" "neither" "never" "nevertheless"
## [381] "new" "newer" "newest" "next"
## [385] "nine" "no" "nobody" "non"
## [389] "none" "noone" "nor" "normally"
## [393] "not" "nothing" "novel" "now"
## [397] "nowhere" "number" "numbers" "o"
## [401] "obviously" "of" "off" "often"
## [405] "oh" "ok" "okay" "old"
## [409] "older" "oldest" "on" "once"
## [413] "one" "ones" "only" "onto"
## [417] "open" "opened" "opening" "opens"
## [421] "or" "order" "ordered" "ordering"
## [425] "orders" "other" "others" "otherwise"
## [429] "ought" "our" "ours" "ourselves"
## [433] "out" "outside" "over" "overall"
## [437] "own" "p" "part" "parted"
## [441] "particular" "particularly" "parting" "parts"
## [445] "per" "perhaps" "place" "placed"
## [449] "places" "please" "plus" "point"
## [453] "pointed" "pointing" "points" "possible"
## [457] "present" "presented" "presenting" "presents"
## [461] "presumably" "probably" "problem" "problems"
## [465] "provides" "put" "puts" "q"
## [469] "que" "quite" "qv" "r"
## [473] "rather" "rd" "re" "really"
## [477] "reasonably" "regarding" "regardless" "regards"
## [481] "relatively" "respectively" "right" "room"
## [485] "rooms" "s" "said" "same"
## [489] "saw" "say" "saying" "says"
## [493] "second" "secondly" "seconds" "see"
## [497] "seeing" "seem" "seemed" "seeming"
## [501] "seems" "seen" "sees" "self"
## [505] "selves" "sensible" "sent" "serious"
## [509] "seriously" "seven" "several" "shall"
## [513] "shan't" "she" "she'd" "she'll"
## [517] "she's" "should" "shouldn't" "show"
## [521] "showed" "showing" "shows" "side"
## [525] "sides" "since" "six" "small"
## [529] "smaller" "smallest" "so" "some"
## [533] "somebody" "somehow" "someone" "something"
## [537] "sometime" "sometimes" "somewhat" "somewhere"
## [541] "soon" "sorry" "specified" "specify"
## [545] "specifying" "state" "states" "still"
## [549] "sub" "such" "sup" "sure"
## [553] "t" "t's" "take" "taken"
## [557] "tell" "tends" "th" "than"
## [561] "thank" "thanks" "thanx" "that"
## [565] "that's" "thats" "the" "their"
## [569] "theirs" "them" "themselves" "then"
## [573] "thence" "there" "there's" "thereafter"
## [577] "thereby" "therefore" "therein" "theres"
## [581] "thereupon" "these" "they" "they'd"
## [585] "they'll" "they're" "they've" "thing"
## [589] "things" "think" "thinks" "third"
## [593] "this" "thorough" "thoroughly" "those"
## [597] "though" "thought" "thoughts" "three"
## [601] "through" "throughout" "thru" "thus"
## [605] "to" "today" "together" "too"
## [609] "took" "toward" "towards" "tried"
## [613] "tries" "truly" "try" "trying"
## [617] "turn" "turned" "turning" "turns"
## [621] "twice" "two" "u" "un"
## [625] "under" "unfortunately" "unless" "unlikely"
## [629] "until" "unto" "up" "upon"
## [633] "us" "use" "used" "useful"
## [637] "uses" "using" "usually" "uucp"
## [641] "v" "value" "various" "very"
## [645] "via" "viz" "vs" "w"
## [649] "want" "wanted" "wanting" "wants"
## [653] "was" "wasn't" "way" "ways"
## [657] "we" "we'd" "we'll" "we're"
## [661] "we've" "welcome" "well" "wells"
## [665] "went" "were" "weren't" "what"
## [669] "what's" "whatever" "when" "when's"
## [673] "whence" "whenever" "where" "where's"
## [677] "whereafter" "whereas" "whereby" "wherein"
## [681] "whereupon" "wherever" "whether" "which"
## [685] "while" "whither" "who" "who's"
## [689] "whoever" "whole" "whom" "whose"
## [693] "why" "why's" "will" "willing"
## [697] "wish" "with" "within" "without"
## [701] "won't" "wonder" "work" "worked"
## [705] "working" "works" "would" "wouldn't"
## [709] "x" "y" "year" "years"
## [713] "yes" "yet" "you" "you'd"
## [717] "you'll" "you're" "you've" "young"
## [721] "younger" "youngest" "your" "yours"
## [725] "yourself" "yourselves" "z" "zero"
# One token (word) per row, lowercased by unnest_tokens(); English stop
# words are removed. The join key is spelled out so the join is explicit
# and does not rely on dplyr guessing the common column.
ut = transcripts %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") # drop stop words
## Joining, by = "word"
glimpse(ut)
## Observations: 34,602
## Variables: 4
## $ speaker <chr> "Sokka", "Sokka", "Sokka", "Sokka", "Sokka", "Sokka",...
## $ epi_num <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ epi_name <fct> The Boy in the Iceberg, The Boy in the Iceberg, The B...
## $ word <chr> "time", "watch", "learn", "katara", "catch", "fish", ...
Contagem de palavras por personagem (após remoção de stop words):
# Word totals per character; `sort = TRUE` puts the most talkative first
# (equivalent to count() followed by arrange(desc(n))).
count_speaker <- ut %>%
  count(speaker, sort = TRUE)
count_speaker
## # A tibble: 358 x 2
## speaker n
## <chr> <int>
## 1 Sokka 5521
## 2 Aang 5027
## 3 Katara 4102
## 4 Zuko 2374
## 5 Iroh 1658
## 6 Toph 1489
## 7 Azula 944
## 8 Zhao 566
## 9 Jet 455
## 10 Hakoda 341
## # ... with 348 more rows
# Flag the 18 characters with the most words; only these are kept for the
# tf-idf and correlation analyses below.
count_speaker$Selecionado = "Não"
count_speaker$Selecionado[1:18] = "Sim"
# Bar chart of word counts for the 36 most talkative characters, with the
# selected 18 drawn at higher opacity.
count_speaker %>%
mutate(speaker = factor(speaker, levels = speaker)) %>% # keep descending order on the x axis
head(36) %>%
ggplot(aes(x = speaker, y = n, alpha = Selecionado)) +
geom_bar(stat = "identity") +
xlab("Personagem") +
ylab("Quantidade de palavras") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.3)) +
scale_alpha_discrete(range = c(0.4, 0.9)) + # "Não" faint, "Sim" strong
ggtitle("Palavras por personagens e personagens selecionados") +
guides(alpha = guide_legend(reverse = TRUE))
Optei por utilizar apenas os 18 personagens com maior número de palavras e calcular as palavras mais utilizadas e a medida tf-idf considerando cada personagem como um documento. A medida tf-idf nos permite avaliar palavras que caracterizam um determinado documento, neste caso personagem, em relação a um conjunto de documentos. Ela é calculada da seguinte forma:
\[ w_{t,d} = \frac{tf_{t,d}}{\sum_{t' \in d}f_{t',d}} \times \log \Bigg( \frac{N}{n_t} \Bigg) \]
em que
\(w_{t,d}\) é o valor do peso para o termo \(t\) no documento \(d\);
\(tf_{t,d}\) é a quantidade de vezes que o termo \(t\) aparece no documento \(d\);
\(\sum_{t' \in d}f_{t',d}\) é a quantidade total de termos do documento \(d\);
\(N\) é o número total de documentos;
\(n_t\) é o número de documentos que contém o termo \(t\).
# Restrict to the 18 characters with the most words and score every
# (word, character) pair with tf-idf, treating each character's lines as
# one document.
personagens <- count_speaker$speaker[1:18]
tfidf <- ut %>%
  filter(speaker %in% personagens) %>%
  count(word, speaker) %>%
  bind_tf_idf(word, speaker, n) %>%
  arrange(desc(tf_idf))
Visualizando as palavras mais utilizadas e os maiores tf-idf por personagem
# Lookup table mapping each selected character to its display order
# (1 = most words); used to keep facet ordering consistent below.
tmp1 <- tibble(
  speaker = count_speaker$speaker[1:18],
  order_speaker = seq_len(18)
)
# Top-10 most frequent words per character. `order_word` is a global row
# number so each facet can get its own ordered axis later.
tmp2 = tfidf %>%
inner_join(tmp1) %>%
mutate(speaker = reorder(speaker, order_speaker)) %>% # facets follow word-count order
arrange(speaker, desc(n)) %>%
group_by(speaker) %>%
do(head(., 10)) %>% # 10 most used words of each character
ungroup() %>%
arrange(speaker, n) %>% # ascending so bars read bottom-up after coord_flip
mutate(order_word = row_number())
## Joining, by = "speaker"
# Same construction as tmp2, but ranking words by tf-idf instead of raw
# frequency.
tmp3 = tfidf %>%
inner_join(tmp1) %>%
mutate(speaker = reorder(speaker, order_speaker)) %>% # facets follow word-count order
arrange(speaker, desc(tf_idf)) %>%
group_by(speaker) %>%
do(head(., 10)) %>% # 10 highest tf-idf words of each character
ungroup() %>%
arrange(speaker, tf_idf) %>%
mutate(order_word = row_number())
## Joining, by = "speaker"
Selecionando cores das nações com uso do https://imagecolorpicker.com/ e a partir da imagem
# Nation colors sampled from the show's art via imagecolorpicker.com.
water = "#27A3EB"
fire = "#D7333C"
earth = "#6FCC36"
#air = "#FFE700"
air = "gold1" # named color used instead of the sampled yellow
# Faceted bar chart: each selected character's 10 most used words, filled
# with the character's nation color. The values vector must match the
# speaker factor-level order set in tmp2.
ggplot(tmp2, aes(order_word, n, fill = speaker)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~ speaker, scales = "free", ncol = 6) +
theme_bw() +
scale_x_continuous(breaks = tmp2$order_word, labels = tmp2$word) + # map row ids back to words
scale_y_continuous(breaks = scales::pretty_breaks(2)) +
coord_flip() +
labs(title = "Palavras mais utilizadas por personagens",
x = NULL,
y = "n") +
scale_fill_manual(values = c(water, air, water, fire, fire, earth,
fire, fire, earth, water, air, fire,
fire, earth, water, earth, earth, fire))
# Same faceted layout as above, but showing each character's 10 highest
# tf-idf words instead of raw counts.
ggplot(tmp3, aes(order_word, tf_idf, fill = speaker)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~ speaker, scales = "free", ncol = 6) +
xlab("words") +
ylab("tf-idf") +
theme_bw() +
scale_x_continuous(breaks = tmp3$order_word, labels = tmp3$word) + # map row ids back to words
scale_y_continuous(breaks = scales::pretty_breaks(2)) +
coord_flip() +
labs(title = "Palavras com maiores tf-idf por personagem",
x = NULL,
y = "tf-idf") +
scale_fill_manual(values = c(water, air, water, fire, fire, earth,
fire, fire, earth, water, air, fire,
fire, earth, water, earth, earth, fire))
Primeiramente, agrupei as informações das notas com a base do roteiro e criei uma variável para indicar a fração da quantidade de falas dos personagens em relação ao total de falas do episódio.
tmp = tibble(epi_num = unique(transcripts$epi_num), rating = ratings)
# Join ratings onto the transcripts and compute, per episode, each selected
# character's share of that episode's lines. The join key is spelled out
# instead of being inferred by dplyr. The result intentionally stays
# grouped by epi_num (as the glimpse below shows); downstream filtering is
# unaffected.
prop = transcripts %>%
inner_join(tmp, by = "epi_num") %>%
filter(speaker %in% personagens) %>%
count(epi_num, rating, speaker) %>% # lines per (episode, character)
group_by(epi_num) %>%
mutate(prop = n/sum(n))
## Joining, by = "epi_num"
glimpse(prop)
## Observations: 387
## Variables: 5
## Groups: epi_num [61]
## $ epi_num <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ rating <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.3, 8.3, 8.3, 8.3, 8.3, 8.5,...
## $ speaker <chr> "Aang", "Iroh", "Katara", "Sokka", "Zuko", "Aang", "Ir...
## $ n <int> 47, 7, 50, 31, 8, 30, 3, 35, 33, 12, 52, 13, 32, 32, 2...
## $ prop <dbl> 0.32867133, 0.04895105, 0.34965035, 0.21678322, 0.0559...
Por exemplo, o personagem Aang tem 47 falas no episódio 1, o que corresponde a aproximadamente 32,9% das falas deste episódio.
Posteriormente, criei uma base com as notas dos episódios.
# Ratings per episode, colored by season (seasons have 20, 20 and 21
# episodes), using the nation palette defined above.
episodios = tibble(Episódio = as.factor(1:61),
Nota = ratings,
Temporada = as.factor(c(rep(1, 20), rep(2, 20), rep(3, 21))))
episodios %>%
ggplot(aes(x = Episódio, y = Nota, fill = Temporada)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c(water, earth, fire)) +
theme_bw() +
coord_cartesian(ylim = c(7, 10)) + # zoom the axis without dropping bars
scale_x_discrete(breaks = seq(1, 61, 2)) +
ggtitle("Nota por episódio")
Finalmente, calculei a correlação de Spearman entre a proporção de falas do personagem e a nota do IMDB. Preferi utilizar esta medida de correlação ao invés da correlação de Pearson pois não espero que a correlação entre estas variáveis seja linear.
# Build the length-`n_epi` vector of speech proportions for character `x`,
# one entry per episode, with 0 for episodes where the character has no
# lines (reads the global `prop` built above).
# Fixes: preallocates `out` (it was grown from NULL element by element),
# uses seq_len() instead of a hard-coded 1:61, and replaces the scalar
# misuse of ifelse() with a plain if/else. The new `n_epi` argument
# defaults to 61, so existing one-argument calls are unchanged.
freq2vec <- function(x, n_epi = 61) {
  out <- numeric(n_epi)
  for (i in seq_len(n_epi)) {
    tmp <- prop %>%
      filter(speaker == x,
             epi_num == i) %>%
      .$prop
    # tmp has at most one value per (speaker, episode); default to 0
    out[i] <- if (length(tmp) == 0) 0 else tmp[1]
  }
  out
}
# Matrix with the episode rating in column 1 and one column per selected
# character holding that character's proportion of lines per episode.
mat_freq = ratings %>%
cbind(sapply(personagens, freq2vec))
colnames(mat_freq)[1] = "rating"
# Spearman correlation of each character's proportion with the rating
# (first row/column dropped to keep only character-vs-rating values).
cor_rating = cor(mat_freq, method = "spearman")[-1,1]
sort(cor_rating, decreasing = TRUE)
## Zuko Ozai Toph Azula Hakoda Suki
## 0.44011830 0.40957611 0.35787257 0.30859284 0.23609850 0.22193422
## Iroh Roku Zhao Bumi Long Feng Mai
## 0.21074101 0.17174359 0.16627787 0.15974255 0.15108082 0.12588468
## Hama Pathik Jet Sokka Aang Katara
## 0.12122290 0.06229572 -0.05067584 -0.38144198 -0.42776043 -0.60494221
# Bar chart of the Spearman correlations, characters ordered by the value.
tibble(spearman = cor_rating, speaker = names(cor_rating)) %>%
mutate(speaker = reorder(speaker, spearman)) %>%
ggplot(aes(speaker, spearman)) +
geom_bar(stat = "identity") +
xlab("Personagem") +
ylab("Correlação de Spearman") +
ggtitle("Correlações de Spearman \nentre frequências de falas do personagem por episódio e notas do IMDB") +
theme_bw() +
scale_y_continuous(breaks = scales::pretty_breaks(3))
Construir uma visualização para as correlações;
Adicionar visualizações intermediárias;
Como as principais palavras utilizadas por personagens são os nomes de outros personagens, construir um diagrama de Chord para visualizar a quantidade de vezes que cada personagem utilizou o nome de outro em suas falas;
Adicionar comentários/interpretações para as visualizações.
Testar hierarchical ordering nas matrizes de frequência e/ou tf-idf.
# Wide word-by-character tf matrix; rownames are blanked so the heatmap
# does not attempt to print thousands of word labels.
mat = reshape2::acast(tfidf, word ~ speaker, value.var = "tf", fill = 0)
rownames(mat) = rep("", nrow(mat))
dim(mat)
## [1] 4777 18
heatmap(mat)
library(circlize) # https://jokergoo.github.io/circlize_book/book/
# Sector colors for the chord diagrams: one entry per character, using the
# character's nation color ("Feng" is shorthand for Long Feng).
grid.col = c(Aang = air, Azula = fire, Bumi = earth, Feng = earth, Hakoda = water,
Hama = water, Iroh = fire, Jet = earth, Katara = water, Mai = fire,
Ozai = fire, Pathik = air, Roku = fire, Sokka = water, Suki = earth,
Toph = earth, Zhao = fire, Zuko = fire)
# Keep only words that are themselves character names, lowercase the
# speaker names to match the tokens, and shorten "long feng" to "feng"
# so rows and columns share labels.
tmp = tfidf %>%
filter(word %in% stringr::str_to_lower(c(personagens, "feng"))) %>%
mutate(speaker = stringr::str_to_lower(speaker))
tmp$speaker[which(tmp$speaker == "long feng")] = "feng"
# Square count matrix: rows = who speaks, columns = who is mentioned.
mat = reshape2::acast(tmp, speaker ~ word, value.var = "n", fill = 0)
dim(mat)
## [1] 18 18
colnames(mat)
## [1] "aang" "azula" "bumi" "feng" "hakoda" "hama" "iroh"
## [8] "jet" "katara" "mai" "ozai" "pathik" "roku" "sokka"
## [15] "suki" "toph" "zhao" "zuko"
# Restore capitalized display names; rows reuse the same labels so the
# matrix stays square and symmetric in its labeling.
colnames(mat) = c("Aang", "Azula", "Bumi", "Feng", "Hakoda", "Hama", "Iroh",
"Jet", "Katara", "Mai", "Ozai", "Pathik", "Roku", "Sokka",
"Suki", "Toph", "Zhao", "Zuko")
rownames(mat) = colnames(mat)
Utilizando n
# Directed chord diagram of raw mention counts (row speaker -> column name).
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat, grid.col = grid.col,
directional = 1, annotationTrack = c("name", "grid"))
# Reorder sectors by similarity for the next diagram.
mat = textshape::cluster_matrix(mat)
Utilizando n e reordenando
# Same directed diagram after hierarchical reordering of rows/columns.
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat, grid.col = grid.col,
directional = 1, annotationTrack = c("name", "grid"))
# Rebuild the mention matrix, this time weighted by term frequency (tf)
# instead of raw counts.
tmp = tfidf %>%
filter(word %in% stringr::str_to_lower(c(personagens, "feng"))) %>%
mutate(speaker = stringr::str_to_lower(speaker))
tmp$speaker[which(tmp$speaker == "long feng")] = "feng"
mat = reshape2::acast(tmp, speaker ~ word, value.var = "tf", fill = 0)
dim(mat)
## [1] 18 18
colnames(mat)
## [1] "aang" "azula" "bumi" "feng" "hakoda" "hama" "iroh"
## [8] "jet" "katara" "mai" "ozai" "pathik" "roku" "sokka"
## [15] "suki" "toph" "zhao" "zuko"
# Restore capitalized display names; rows reuse the same labels.
colnames(mat) = c("Aang", "Azula", "Bumi", "Feng", "Hakoda", "Hama", "Iroh",
"Jet", "Katara", "Mai", "Ozai", "Pathik", "Roku", "Sokka",
"Suki", "Toph", "Zhao", "Zuko")
rownames(mat) = colnames(mat)
Utilizando tf
# Directed chord diagram of tf-weighted mentions.
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat, grid.col = grid.col,
directional = 1, annotationTrack = c("name", "grid"))
# Reorder sectors by similarity for the next diagram.
mat = textshape::cluster_matrix(mat)
Utilizando tf e reordenando
# Same tf-weighted diagram after hierarchical reordering.
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat, grid.col = grid.col,
directional = 1, annotationTrack = c("name", "grid"))
# Rebuild the raw-count matrix for the undirected version below.
mat = reshape2::acast(tmp, speaker ~ word, value.var = "n", fill = 0)
dim(mat)
## [1] 18 18
colnames(mat)
## [1] "aang" "azula" "bumi" "feng" "hakoda" "hama" "iroh"
## [8] "jet" "katara" "mai" "ozai" "pathik" "roku" "sokka"
## [15] "suki" "toph" "zhao" "zuko"
# Restore capitalized display names; rows reuse the same labels.
colnames(mat) = c("Aang", "Azula", "Bumi", "Feng", "Hakoda", "Hama", "Iroh",
"Jet", "Katara", "Mai", "Ozai", "Pathik", "Roku", "Sokka",
"Suki", "Toph", "Zhao", "Zuko")
rownames(mat) = colnames(mat)
# Symmetrize the mention matrix: mat2[i, j] = mentions in both directions
# (mat[i, j] + mat[j, i]), keeping the diagonal (self-mentions) as-is.
# This replaces the original double loop, which had a bug: for i = 18 the
# sequence (i+1):18 evaluates to c(19, 18); the `j < 19` guard skipped 19
# but still let j = 18 through, doubling mat2[18, 18]. The vectorized form
# has no such edge case and works for any square matrix.
mat2 = mat + t(mat)
diag(mat2) = diag(mat)
Sem direção (soma das palavras nas duas direções)
# Undirected chord diagram: link width = total mentions in both directions.
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat2, grid.col = grid.col, annotationTrack = c("name", "grid"))
Sem direção e ordenando
# Same undirected diagram after hierarchical reordering of the sectors.
mat2 = textshape::cluster_matrix(mat2)
circos.clear()
circos.par(start.degree = 90, clock.wise = TRUE)
chordDiagram(mat2, grid.col = grid.col, annotationTrack = c("name", "grid"))